{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "gewjvw0krZOX"
   },
   "source": [
    "# Mask-RCNN Label Generation\n",
    "\n",
    "## Notebook Setup \n",
    "The following cell will install Drake, checkout the manipulation repository, and set up the path (only if necessary).\n",
    "- On Google's Colaboratory, this **will take approximately two minutes** on the first time it runs (to provision the machine), but should only need to reinstall once every 12 hours.  \n",
    "\n",
    "More details are available [here](http://manipulation.mit.edu/drake.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "4budwoxsrZOY"
   },
   "outputs": [],
   "source": [
    "import importlib.util\n",
    "import os\n",
    "import sys\n",
    "from urllib.request import urlretrieve\n",
    "\n",
    "# On Colab, fetch and run the setup script when the manipulation package is not importable yet.\n",
    "# NOTE(review): the script is downloaded over plain http and then imported; prefer https if the host supports it.\n",
    "if 'google.colab' in sys.modules and importlib.util.find_spec('manipulation') is None:\n",
    "    urlretrieve(\"http://manipulation.csail.mit.edu/scripts/setup/setup_manipulation_colab.py\",\n",
    "                \"setup_manipulation_colab.py\")\n",
    "    from setup_manipulation_colab import setup_manipulation\n",
    "    setup_manipulation(manipulation_sha='9334a842594b94b69b6ec59d7806e7618a19206d', drake_version='20201020', drake_build='nightly')\n",
    "\n",
    "from IPython import get_ipython\n",
    "running_as_notebook = get_ipython() and hasattr(get_ipython(), 'kernel')\n",
    "\n",
    "# Setup rendering (with xvfb), if necessary:\n",
    "if 'google.colab' in sys.modules and os.getenv(\"DISPLAY\") is None:\n",
    "    from pyvirtualdisplay import Display\n",
    "    # named virtual_display so the later `from IPython.display import display` does not rebind it\n",
    "    virtual_display = Display(visible=0, size=(1400, 900))\n",
    "    virtual_display.start()\n",
    "\n",
    "# setup ngrok server\n",
    "server_args = []\n",
    "if 'google.colab' in sys.modules:\n",
    "    server_args = ['--ngrok_http_tunnel']\n",
    "\n",
    "from meshcat.servers.zmqserver import start_zmq_server_as_subprocess\n",
    "proc, zmq_url, web_url = start_zmq_server_as_subprocess(server_args=server_args)\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from pydrake.all import ( \n",
    "    AddMultibodyPlantSceneGraph, ConnectMeshcatVisualizer, \n",
    "    DiagramBuilder, RigidTransform, RotationMatrix, Parser, Simulator,\n",
    "    FindResourceOrThrow,Quaternion, AddTriad\n",
    ")\n",
    "\n",
    "import open3d as o3d\n",
    "import meshcat\n",
    "import meshcat.geometry as g\n",
    "import meshcat.transformations as tf\n",
    "\n",
    "from manipulation.meshcat_utils import draw_open3d_point_cloud, draw_points\n",
    "from manipulation.mustard_depth_camera_example import MustardExampleSystem\n",
    "from manipulation.utils import FindResource\n",
    "from manipulation.open3d_utils import create_open3d_point_cloud\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.patches import Rectangle\n",
    "from copy import deepcopy\n",
    "import pydot\n",
    "from ipywidgets import Dropdown, Layout\n",
    "from IPython.display import display, HTML, SVG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Hpzje8Y0rZOd",
    "outputId": "41c123c5-3303-4dda-c7ca-0e1d997c9e2c"
   },
   "outputs": [],
   "source": [
    "class SimpleCameraSystem:\n",
    "    def __init__(self):\n",
    "        diagram = MustardExampleSystem()\n",
    "        context = diagram.CreateDefaultContext()\n",
    "        \n",
    "        # setup\n",
    "        self.meshcat_vis = meshcat.Visualizer(zmq_url=zmq_url)\n",
    "        self.meshcat_vis[\"/Background\"].set_property(\"visible\", False)\n",
    "        \n",
    "        # getting data\n",
    "        self.point_cloud = diagram.GetOutputPort(\"camera0_point_cloud\").Eval(context)\n",
    "        self.depth_im_read = diagram.GetOutputPort('camera0_depth_image').Eval(context).data.squeeze()\n",
    "        self.depth_im = deepcopy(self.depth_im_read)\n",
    "        self.depth_im[self.depth_im == np.inf] = 10.0\n",
    "        label_im = diagram.GetOutputPort('camera0_label_image').Eval(context).data.squeeze()\n",
    "        self.rgb_im = diagram.GetOutputPort('camera0_rgb_image').Eval(context).data\n",
    "        self.mask = label_im == 1\n",
    "\n",
    "        # draw visualization\n",
    "        pcd = create_open3d_point_cloud(self.point_cloud)\n",
    "        draw_open3d_point_cloud(self.meshcat_vis[\"self.point_cloud\"], pcd)\n",
    "        \n",
    "        # camera specs\n",
    "        cam0 = diagram.GetSubsystemByName('camera0')\n",
    "        cam0_context = cam0.GetMyMutableContextFromRoot(context)\n",
    "        cam0_pose = cam0.GetOutputPort('X_WB').Eval(cam0_context)\n",
    "        self.cam_info = cam0.depth_camera_info()\n",
    "        self.X_WC = RigidTransform(quaternion=Quaternion(cam0_pose[3:]), p=cam0_pose[0:3])\n",
    "\n",
    "        # get points for mustard bottle\n",
    "        depth_mustard = self.mask * self.depth_im\n",
    "        u_range = np.arange(depth_mustard.shape[0])\n",
    "        v_range = np.arange(depth_mustard.shape[1])\n",
    "        depth_v, depth_u = np.meshgrid(v_range, u_range)\n",
    "        depth_pnts = np.dstack([depth_u, depth_v, depth_mustard])\n",
    "        depth_pnts = depth_pnts.reshape([depth_pnts.shape[0]*depth_pnts.shape[1], 3])\n",
    "        pC =self.project_depth_to_pC(depth_pnts)\n",
    "        p_C_mustard = pC[pC[:,2] > 0]\n",
    "        self.p_W_mustard = self.X_WC.multiply(p_C_mustard.T).T\n",
    "    \n",
    "    def get_color_image(self):\n",
    "        return deepcopy(self.rgb_im[:,:,0:3])\n",
    "    \n",
    "    def get_intrinsics(self):\n",
    "        # read camera intrinsics\n",
    "        cx = self.cam_info.center_x()\n",
    "        cy = self.cam_info.center_y()\n",
    "        fx = self.cam_info.focal_x()\n",
    "        fy = self.cam_info.focal_y()\n",
    "        return cx, cy, fx, fy\n",
    "    def project_depth_to_pC(self, depth_pixel):\n",
    "        \"\"\"\n",
    "        project depth pixels to points in camera frame\n",
    "        using pinhole camera model\n",
    "        Input:\n",
    "            depth_pixels: numpy array of (nx3) or (3,)\n",
    "        Output:\n",
    "            pC: 3D point in camera frame, numpy array of (nx3)\n",
    "        \"\"\"\n",
    "        # switch u,v due to python convention\n",
    "        v = depth_pixel[:,0]\n",
    "        u = depth_pixel[:,1]\n",
    "        Z = depth_pixel[:,2]\n",
    "        cx, cy, fx, fy = self.get_intrinsics()\n",
    "        X = (u-cx) * Z/fx\n",
    "        Y = (v-cy) * Z/fy\n",
    "        pC = np.c_[X,Y,Z]\n",
    "        return pC\n",
    "    \n",
    "def bbox(img):\n",
    "    a = np.where(img != 0)\n",
    "    bbox = ([np.min(a[0]), np.max(a[0])], [np.min(a[1]), np.max(a[1])])\n",
    "    return bbox\n",
    "        \n",
    "# Build the camera snapshot once and expose the pieces used by the cells below.\n",
    "env = SimpleCameraSystem()\n",
    "X_WC = env.X_WC  # camera pose in the world frame\n",
    "p_W_mustard = env.p_W_mustard  # mustard-bottle points in the world frame\n",
    "K = env.cam_info.intrinsic_matrix()  # intrinsic matrix of the depth camera\n",
    "rgb_im = env.get_color_image()  # copy of the first three channels of the rgb image"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "2fj0D-mHrZOn"
   },
   "source": [
    "# Generate Mask Labels\n",
    "\n",
    "In the lecture, you have learned about Mask-RCNN. A major difficulty in training/fine-tuning Mask-RCNN is to obtain high-quality real training data, especially the mask labels for the objects of interest. Although you can get training labels from [Amazon Mechanical Turk](https://www.mturk.com/), it is a paid service and you will have to wait for some time until you get your data labeled. An alternative method is to design clever pipelines to generate labeled masks automatically without requiring manual labor.  \n",
    "\n",
    "Consider a setup where an object of interest is placed on a planar surface, and an RGBD camera is mounted at a fixed location pointing to the object. From the RGBD camera, you should be able to generate the corresponding point clouds of the desired object and the surrounding environment (e.g. the planar surface). You can easily remove the points associated with the planar surface (recall the RANSAC exercise in problem set 2). The remaining points should then all belong to the desired object. To generate mask labels, all you need to do is to project the points back to the camera image plane using the pinhole camera model!\n",
    "\n",
    "Let's quickly review the pinhole camera model!\n",
    "\n",
    "In the last problem set, you worked with the [pinhole camera model](https://docs.opencv.org/3.4/d9/d0c/group__calib3d.html). In particular, you used the pinhole camera model to map the depth pixels to 3D points. You may review the `SimpleCameraSystem` class from the last problem set.\n",
    "\n",
    "The mathematical description of the pinhole camera model is written below (you may also use the intrinsics matrix by `env.cam_info.intrinsic_matrix()`).\n",
    "\n",
    "The pinhole camera model relates a pixel $(u, v)$ with depth $Z_c$ to a point $(X_c, Y_c, Z_c)$ in the camera frame:\n",
    "\\begin{equation}\n",
    "X_c = (u-c_x)\\frac{Z_c}{f_x}\n",
    "\\end{equation}\n",
    "\n",
    "\\begin{equation}\n",
    "Y_c = (v-c_y)\\frac{Z_c}{f_y}\n",
    "\\end{equation}\n",
    "\n",
    "Notations:\n",
    "- $f_x$: focal length in x direction\n",
    "- $f_y$: focal length in y direction\n",
    "- $c_x$: principal point in x direction (pixels)\n",
    "- $c_y$: principal point in y direction (pixels)\n",
    "- $(X_c, Y_c, Z_c)$: points in camera frame\n",
    "\n",
    "where $f_x$, $f_y$, $c_x$, $c_y$ specify the intrinsics of the camera.\n",
    "\n",
    "The diagram of the pinhole camera model below is found from [OpenCV documentation](https://docs.opencv.org/3.4/d9/d0c/group__calib3d.html). Note that the $u$ and $v$ directions are reversed in Python due to the difference in the convention."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OaXlVRcskbDf"
   },
   "source": [
    "![](https://docs.opencv.org/3.4/pinhole_camera_model.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "5lOkEMMArZOo"
   },
   "source": [
    "## Generate Mask from Point Clouds\n",
    "\n",
    "Now, given the points of the mustard bottle in the world frame, `p_W_mustard`, can you re-project these points back to the image plane to construct the mask of the mustard bottle? Note that you may need to reverse the u, v indices to get the mask of the mustard bottle upright. Your mask should be of the same size as the original depth image, which is (480, 640)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "uSvk2b-WrZOp"
   },
   "outputs": [],
   "source": [
    "def deproject_pW_to_image(p_W_mustard, cx, cy, fx, fy, X_WC):\n",
    "    \"\"\"\n",
    "    convert points in the world frame to camera pixels\n",
    "    Input:\n",
    "        - p_W_mustard: points of the mustard bottle in world frame (nx3)\n",
    "        - fx, fy, cx, cy: camera intrinsics\n",
    "        - X_WC: camera pose in the world frame \n",
    "    Output:\n",
    "        - mask: numpy array of size 480x640\n",
    "    \"\"\"\n",
    "\n",
    "    mask = np.zeros([480, 640])\n",
    "    return mask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "X8-LRTNMrZOt",
    "outputId": "a09f2d5e-c051-445f-f118-6f0e635c89c3",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Read the intrinsics from the camera and build the mask from the world-frame points.\n",
    "cx, cy, fx, fy = env.get_intrinsics()\n",
    "mask = deproject_pW_to_image(p_W_mustard, cx, cy, fx, fy, X_WC)\n",
    "plt.imshow(mask)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jAoTk3YfrZOy"
   },
   "source": [
    "You should be able to visually verify that the generated mask aligns perfectly with the mustard bottle!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jlUvId9TrZOy",
    "outputId": "4c486ab2-180f-4f23-b787-b409b04b97cf"
   },
   "outputs": [],
   "source": [
    "# Show the original color image for comparison with the generated mask.\n",
    "plt.imshow(rgb_im)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PmMxCE6KrZO3",
    "outputId": "95511567-dbd7-4b88-b0ca-65aa5c6d4b39"
   },
   "outputs": [],
   "source": [
    "# Broadcast the 2D mask over the color channels; pixels where the mask is zero become black.\n",
    "masked_rgb = rgb_im * mask[:,:,np.newaxis].astype(int) \n",
    "plt.imshow(masked_rgb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "sJSzxkPzOrwc"
   },
   "source": [
    "# Analysis for Cluttered Scenes\n",
    "**Now you have a working pipeline for generating mask labels for single objects on a planar surface. Now suppose that, besides the mustard bottle, there is also a Cheez-It box in the scene. Both objects appear in your RGBD image. Answer the following questions.**\n",
    "\n",
    "**6.1.b** Assume the Cheez-It box is not occluding the mustard bottle, without any modification to the pipeline above, can you get the separate masks of the mustard bottle and Cheez-It box? Explain your reasoning.\n",
    "\n",
    "**6.1.c** Assume the Cheez-It box is occluding the mustard bottle, without any modification to the pipeline above, can you get the separate masks of the mustard bottle and Cheez-It box? Explain your reasoning."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "kVAB7pkurZO7"
   },
   "source": [
    "# Generate Training Images and Masks via Data Augmentation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "nyiEhiNTrZO8"
   },
   "source": [
    "[Data augmentation](https://en.wikipedia.org/wiki/Data_augmentation) is commonly used to generate more training data from the existing data. For example, a common trick to generate training images and masks for occluded scenes is to randomly insert rendered objects on top of the real image. Similarly, you can randomly scale, flip, rotate, duplicate, and crop to \"simulate\" more complex scenes.  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "AQ_P53yesblo"
   },
   "source": [
    "<figure>\n",
    "<center>\n",
    "<img src='https://developers.google.com/machine-learning/practica/image-classification/images/data_augmentation.png' />\n",
    "<figcaption>Example Image Data Augmentation (credit: developers.google.com)</figcaption></center>\n",
    "</figure>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Zs3arsiQsY_a"
   },
   "source": [
    "In this exercise, we ask you to explore different ways to augment from our existing mustard bottle image:\n",
    "- flipping\n",
    "- translating\n",
    "- duplication \n",
    "- cropping\n",
    "- adding noise \n",
    "\n",
    "**Please complete the function below to generate 1 more pair of a color image and mask label using at least 2 tricks above to augment your data. You may use Numpy only!** \n",
    "\n",
    "**Note: make sure you display both your new image and mask below in your notebook submission. Your results will be visually graded.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "UTziZ7y5rZO9"
   },
   "outputs": [],
   "source": [
    "def augment_mustard_bottle(rgb_im, mask):\n",
    "    \"\"\"\n",
    "    perform random rotation, scaling, and duplication to generate\n",
    "    more training images and labels\n",
    "    rgb_im: original rgb image of the mustard bottle\n",
    "    mask: binay mask of the mustard bottle\n",
    "    \"\"\"\n",
    "    augmented_rgb = np.zeros((480, 640, 3))\n",
    "    augmented_mask = np.zeros((480, 640))\n",
    "    return augmented_rgb, augmented_mask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "3f_Z3aWErZPE"
   },
   "outputs": [],
   "source": [
    "# Generate one augmented image/mask pair from the original image and mask.\n",
    "new_img, new_mask = augment_mustard_bottle(rgb_im, mask)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "h18zQE06rZPI",
    "outputId": "998c4c48-044d-43df-e615-5600d2514bd6"
   },
   "outputs": [],
   "source": [
    "# Show the augmented color image.\n",
    "plt.imshow(new_img)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jcbFVSugrZPM",
    "outputId": "e05d2d9d-def0-4f7a-863e-64cdcba8074e"
   },
   "outputs": [],
   "source": [
    "# Show the augmented mask label.\n",
    "plt.imshow(new_mask)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "MwE8yNg58VQN"
   },
   "source": [
    "## How will this notebook be Graded?\n",
    "\n",
    "If you are enrolled in the class, this notebook will be graded using [Gradescope](https://www.gradescope.com). You should have gotten the enrollment code on our announcement in Piazza.\n",
    "\n",
    "For submission of this assignment, you must do two things. \n",
    "- Download and submit the notebook `label_generation.ipynb` to Gradescope's notebook submission section, along with your notebook for the other problems.\n",
    "- Write down your answers to 6.1.b and 6.1.c in a separate PDF file and submit it to Gradescope's written submission section. \n",
    "\n",
    "We will evaluate the local functions in the notebook to see if the function behaves as we have expected. For this exercise, the rubric is as follows:\n",
    "- [3 pts] Correct Implementation of `deproject_pW_to_image` method.\n",
    "- [2 pts] Analysis for Cluttered Scenes: reasonable answers and explanations. \n",
    "- [2 pts] Visually reasonable output from `augment_mustard_bottle`.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 381
    },
    "id": "mKlkgkjjXIKP",
    "outputId": "c1932c14-be4f-4b4e-fe37-42984518829a"
   },
   "outputs": [],
   "source": [
    "from manipulation.exercises.segmentation.test_mask import TestMask\n",
    "from manipulation.exercises.grader import Grader \n",
    "\n",
    "# Grade with the TestMask suite, passing this notebook's variables via locals();\n",
    "# 'results.json' is the results file name handed to both calls.\n",
    "Grader.grade_output([TestMask], [locals()], 'results.json')\n",
    "Grader.print_test_results('results.json')"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Tags",
  "colab": {
   "collapsed_sections": [
    "gewjvw0krZOX",
    "2fj0D-mHrZOn",
    "5lOkEMMArZOo",
    "MwE8yNg58VQN"
   ],
   "name": "Mask-RCNN Label Generation.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}